nipah_RBP_entry_analysis.ipynb¶

This notebook pulls in data on Nipah RBP entry into CHO-bEFNB2 and bCHO-EFNB3 cells, calculates stats, and makes figures¶

  • Written by Brendan Larsen
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
nipah_config = None
altair_config = None
surface = None

e2_distances_file = None
func_scores_E2_file = None
func_scores_E3_file = None
concat_df_file = None
merged_df_file = None

contact_type_plot = None
E2_E3_entry_corr_plot = None
E2_E3_entry_all_muts_plot = None
combined_E2_E3_correlation_plots = None
entry_region_boxplot_plot = None
In [2]:
# Parameters
func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
e2_distances_file = "results/distances/2vsm_distances.csv"
contact_type_plot = "results/images/contact_type_plot.html"
merged_df_file = "results/filtered_data/entry/e2_e3_entry_filter_merged.csv"
concat_df_file = "results/filtered_data/entry/e2_e3_entry_filter_concat.csv"
E2_E3_entry_corr_plot = "results/images/E2_E3_entry_corr_plot.html"
E2_E3_entry_all_muts_plot = "results/images/E2_E3_entry_all_muts_plot.html"
combined_E2_E3_correlation_plots = (
    "results/images/combined_E2_E3_correlation_plots.html"
)
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
entry_region_boxplot_plot = "results/images/entry_region_boxplot_plot.html"
surface = "data/custom_analyses_data/surface_exposure.csv"
In [3]:
import math
import os
import re
import altair as alt

import numpy as np

import pandas as pd

import scipy.stats

import Bio.SeqIO

import yaml

from Bio import AlignIO
from Bio import PDB
from Bio.Align import PairwiseAligner
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

if os.getcwd() == '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/':
    pass
    print("Already in correct directory")
else:
    os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
    print("Setup in correct directory")
Setup in correct directory

Paths for running notebook interactively¶

In [5]:
if nipah_config is None:
    e2_distances_file = "results/distances/2vsm_distances.csv"
    func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
    func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
    
    concat_df_file = "results/filtered_data/entry/e2_e3_entry_filter_concat.csv"
    merged_df_file = "results/filtered_data/entry/e2_e3_entry_filter_merged.csv"
    
    nipah_config = "nipah_config.yaml"
    altair_config = "data/custom_analyses_data/theme.py"
    
    surface = "data/custom_analyses_data/surface_exposure.csv"

Read in custom altair theme and import YAML file with parameters¶

In [6]:
if altair_config:
    with open(altair_config, 'r') as file:
        exec(file.read())

with open(nipah_config) as f:
    config = yaml.safe_load(f)

Import filtered data¶

In [7]:
#Import filtered entry scores from different selections
func_scores_E2 = pd.read_csv(func_scores_E2_file)
func_scores_E3 = pd.read_csv(func_scores_E3_file)
concat_df = pd.read_csv(concat_df_file) #concatanated entry scores
merged_df = pd.read_csv(merged_df_file) #merged entry scores

Calculate some basic stats about E2 and E3 entry scores¶

In [8]:
def calculate_stats(df, name):
    print(f"For {name}:")
    total_mut = (532) * 19 #total number of possible amino acids
    print(f'There are {total_mut} amino acid mutations possible')
    muts_present = df["effect"].shape[0]
    fraction_muts = muts_present / total_mut
    print(
        f"fraction muts present in {name} is {fraction_muts:.2f} {muts_present}/{total_mut}"
    )

    # Break mutations into bins (negative, neutral, positive) and calculate how many mutants there are
    deleterious_muts = df[df["effect"] <= -0.5].shape[0] 
    neutral_muts = df[(df["effect"] > -0.5) & (df["effect"] < 0.5)].shape[0]
    positive_muts = df[df["effect"] > 0.5].shape[0]

    frac_bad_muts = deleterious_muts / muts_present
    frac_neutral_muts = neutral_muts / muts_present
    frac_pos_muts = positive_muts / muts_present
    print(
        f"The number of deleterious mutants for {name} is {frac_bad_muts:.2f} {deleterious_muts}/{muts_present}"
    )
    print(
        f"The number of neutral mutants for {name} is {frac_neutral_muts:.2f} {neutral_muts}/{muts_present}"
    )
    print(
        f"The number of positive mutants for {name} is {frac_pos_muts:.2f} {positive_muts}/{muts_present}\n"
    )

calculate_stats(func_scores_E2, "CHO-EFNB2")
calculate_stats(func_scores_E3, "CHO-EFNB3")
For CHO-EFNB2:
There are 10108 amino acid mutations possible
fraction muts present in CHO-EFNB2 is 0.97 9786/10108
The number of deleterious mutants for CHO-EFNB2 is 0.44 4307/9786
The number of neutral mutants for CHO-EFNB2 is 0.54 5281/9786
The number of positive mutants for CHO-EFNB2 is 0.02 159/9786

For CHO-EFNB3:
There are 10108 amino acid mutations possible
fraction muts present in CHO-EFNB3 is 0.96 9683/10108
The number of deleterious mutants for CHO-EFNB3 is 0.43 4178/9683
The number of neutral mutants for CHO-EFNB3 is 0.56 5415/9683
The number of positive mutants for CHO-EFNB3 is 0.01 66/9683

Find which sites only have negative entry scores for all mutants¶

In [9]:
def overall_stats_all_neg(df,effect,name,region=None):
    print(f'We are analyzing data for {name}:')
    
    if region is None:
        contact_df = df
    else:
        contact_df = df[df['site'].isin(region)]
    
    filtered_df = contact_df.groupby('site').filter(lambda group: (group[effect] < 0).all())
    unique = filtered_df['site'].unique()
    print(list(unique))
    total_sites = contact_df['site'].unique().shape[0]
    subset = filtered_df['site'].unique().shape[0]
       
    fraction = subset/total_sites
    percent = fraction * 100
    print(f'The total number of sites are: {total_sites}\n')
    print(f' The number of sites where all mutants are negative for {effect}: {subset}\n')
    print(f' The percent of sites where all mutants are negative for {effect}: {percent:.0f}\n')
    return unique

intolerant_sites_E2 = list(overall_stats_all_neg(func_scores_E2,'effect','CHO-bEFNB2'))
intolerant_sites_E3 = list(overall_stats_all_neg(func_scores_E3,'effect','CHO-bEFNB3'))
We are analyzing data for CHO-bEFNB2:
[80, 95, 97, 106, 107, 111, 112, 113, 120, 121, 125, 126, 127, 128, 130, 138, 146, 151, 157, 158, 159, 160, 161, 162, 163, 165, 167, 172, 189, 203, 205, 206, 207, 208, 216, 229, 240, 246, 251, 253, 254, 257, 258, 259, 260, 262, 263, 264, 266, 267, 298, 303, 320, 323, 331, 347, 355, 382, 387, 395, 412, 439, 454, 460, 467, 487, 489, 493, 499, 500, 503, 506, 510, 537, 563, 565, 573, 574, 575, 594, 598]
The total number of sites are: 532

 The number of sites where all mutants are negative for effect: 81

 The percent of sites where all mutants are negative for effect: 15

We are analyzing data for CHO-bEFNB3:
[100, 107, 108, 109, 111, 112, 113, 120, 121, 126, 138, 146, 158, 159, 162, 163, 206, 207, 208, 216, 220, 229, 236, 240, 243, 251, 253, 257, 258, 259, 260, 263, 266, 276, 303, 347, 352, 355, 368, 382, 387, 389, 395, 412, 438, 439, 458, 460, 467, 486, 487, 488, 489, 493, 494, 499, 500, 501, 503, 504, 505, 506, 510, 526, 531, 532, 533, 537, 556, 557, 565, 573, 574, 579, 581, 584, 588, 594]
The total number of sites are: 532

 The number of sites where all mutants are negative for effect: 78

 The percent of sites where all mutants are negative for effect: 15

Make bubble plots of receptor contact site type (median values per site)¶

In [10]:
def make_bubbleplot_entry_region(df):  # Create a bubble plot using Altair for contact site mutants
    barrel_ranges = {
        "Hydrophobic": config["hydrophobic"],
        "Salt Bridges": config["salt_bridges"],
        "Hydrogen Bonds": config["h_bond_total"],
        "Contact": config["contact_sites"],
        #"Overall": list(range(71, 602)),
    }
    custom_order = [
        "Hydrophobic",
        "Salt Bridges",
        "Hydrogen Bonds",
        "Contact",
        "Overall",
    ]
    empty_chart = []
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, fields=["site"], value=1
    )
    for selection in ["CHO-EFNB2", "CHO-EFNB3"]:
        agg_means = []
        tmp_df = df[df["cell_type"] == selection]
        mean_df = tmp_df.groupby("site")[["effect"]].mean().reset_index()

        # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
        for barrel, sites in barrel_ranges.items():
            subset = mean_df[mean_df["site"].isin(sites)]
            for _, row in subset.iterrows():
                agg_means.append(
                    {"barrel": barrel, "effect": row["effect"], "site": row["site"]}
                )
        agg_means_df = pd.DataFrame(agg_means)
        chart = (
            alt.Chart(agg_means_df, title=f"{selection}")
            .mark_point()
            .encode(
                x=alt.X(
                    "barrel:O",
                    sort=custom_order,
                    title='Contact Type',
                    axis=alt.Axis(labelAngle=-90),
                ),
                y=alt.Y(
                    "effect",
                    title="Median Cell Entry",
                    axis=alt.Axis(grid=True, tickCount=4),
                ),
                xOffset="random:Q",
                tooltip=["barrel", "effect", "site"],
                size=alt.condition(
                    variant_selector, alt.value(100), alt.value(25)
                ),
                color=alt.condition(
                    variant_selector, alt.value("orange"), alt.value("black")
                ),
                opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.3)),
            ).properties(width=config['width'],height=config['height'])
            .transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())")
        )
        empty_chart.append(chart)
    combined_effect_chart = (
        alt.hconcat(*empty_chart)
        .resolve_scale(y="shared", x="shared", color="independent")
        .add_params(variant_selector)
    )
    return combined_effect_chart


tmp_img = make_bubbleplot_entry_region(concat_df)
tmp_img.display()
if entry_region_boxplot_plot is not None:
    tmp_img.save(contact_type_plot)

Make bubble plots depending on amino acid property¶

In [11]:
def make_bubbleplot_wildtype_prop(df):
    barrel_ranges = {
        "hydrophobic": list(["A", "V", "L", "I", "M"]),
        "aromatic": list(["Y", "W", "F"]),
        "positive": list(["K", "R", "H"]),
        "negative": list(["E", "D"]),
        "hydrophilic": list(["S", "T", "N", "Q"]),
        "special": list(["C", "P", "G"]),
    }
    empty_charts = []
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=["site"], value=1
    )
    for selection in ["CHO-EFNB2", "CHO-EFNB3"]:
        if selection == "CHO-EFNB2":
            effect_name = "EFNB2"
        else:
            effect_name = "EFNB3"

        tmp_df = df[df["cell_type"] == selection]

        unique_wildtype_df = tmp_df[["site", "wildtype"]].drop_duplicates()
        mean_df = tmp_df.groupby("site")[["effect"]].median().reset_index()
        mean_df = pd.merge(mean_df, unique_wildtype_df, on="site", how="left")

        agg_means = []

        # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
        for barrel, sites in barrel_ranges.items():
            subset = mean_df[mean_df["wildtype"].isin(sites)]
            for _, row in subset.iterrows():
                agg_means.append(
                    {"wildtype_class": barrel, "effect": row["effect"], "site": row["site"], "wildtype": row["wildtype"]}
                )
        agg_means_df = pd.DataFrame(agg_means)

        chart = (
            alt.Chart(agg_means_df, title=f"{selection}")
            .mark_point()
            .encode(
                x=alt.X(
                    "wildtype_class:O",
                    title="Wildtype amino acid property",
                    axis=alt.Axis(labelAngle=-90),
                ),  
                y=alt.Y(
                    "effect",
                    title=f"Median Cell Entry",
                    axis=alt.Axis(grid=True, tickCount=4),
                ),
                xOffset="random:Q",
                tooltip=["wildtype_class", "effect", "site","wildtype"],
                size=alt.condition(variant_selector, alt.value(100), alt.value(25)),
                color=alt.condition(
                    variant_selector, alt.value("orange"), alt.value("black")
                ),
                opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.2)),
            ).properties(width=config['width'],height=config['height'])
            .transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())")
            
        )
        empty_charts.append(chart)
    combined_effect_chart = (
        alt.hconcat(*empty_charts)
        .resolve_scale(y="shared", x="shared", color="independent")
        .add_params(variant_selector)
    )
    return combined_effect_chart


wildtype_aa_bubble_img = make_bubbleplot_wildtype_prop(concat_df)
wildtype_aa_bubble_img.display()

Plot correlations between E2 and E3 entry¶

In [12]:
# Import distance data
e2_distances = pd.read_csv(e2_distances_file)
distance_df = pd.merge(
    merged_df, e2_distances[["site", "distance"]], on="site", how="left"
)

def determine_distance(df):
    # Define the conditions
    conditions = [
        df["distance"] < 4,
        (df["distance"] >= 4) & (df["distance"] <= 8),
        df["distance"] > 8,
    ]

    # Define the associated values for the conditions
    choices = ["contact", "close", "distant"]

    # Apply the conditions and choices to the 'E2_contact' column
    df["contact"] = np.select(conditions, choices, default="distant")
    return df


distance_df = determine_distance(distance_df)


def median_correlation_plot(df, metric):
    aggregation = getattr(df.groupby("site")[["effect_E2", "effect_E3"]], metric)
    means = aggregation().reset_index()
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        means["effect_E2"], means["effect_E3"]
    )
    display(r_value.round(2))

    means = means.rename(
        columns={"effect_E2": f"{metric}_E2", "effect_E3": f"{metric}_E3"}
    )

    contact_sites = df[["site", "contact", "wildtype"]].drop_duplicates()
    df_mean = pd.merge(means, contact_sites, on="site", how="left")

    chart = (
        alt.Chart(df_mean)
        .mark_point(size=40,stroke='black',strokeWidth=1)
        .encode(
            x=alt.X(f"{metric}_E2", title="Entry in CHO-bEFNB2"),
            y=alt.Y(f"{metric}_E3", title="Entry in CHO-bEFNB3"),
            tooltip=["site", "wildtype"],
            color=alt.Color(
                "contact",
                scale=alt.Scale(
                    domain=["contact", "close", "distant"],
                    range=["#1f4e79", "#ff7f0e", "gray"],
                ),
                legend=alt.Legend(title="RBP Distance to Receptor"),
            ),
        )
    )
    min_ = int(df_mean[f"{metric}_E2"].min())
    max_ = int(df_mean[f"{metric}_E3"].max())
    text = (
        alt.Chart({"values": [{"x": min_, "y": max_, "text": f"r = {r_value:.2f}"}]})
        .mark_text(
            align="left",
            baseline="top",
            dx=0,  # Adjust this for position
            dy=0,  # Adjust this for position
        )
        .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
    )
    plot = chart + text
    return plot


E2_E3_plot = median_correlation_plot(distance_df, "mean")
E2_E3_plot.display()
if entry_region_boxplot_plot is not None:
    E2_E3_plot.save(E2_E3_entry_corr_plot)


def correlation_plot(df):
    df = df.dropna()
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        df["effect_E2"], df["effect_E3"]
    )
    display(r_value.round(2))
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=["contact"], value=1
    )
    chart = (
        alt.Chart(df)
        .mark_point(opacity=0.2,size=15)
        .encode(
            x=alt.X("effect_E2", title="Cell Entry for EFNB2"),
            y=alt.Y("effect_E3", title="Cell Entry for EFNB3"),
            tooltip=["site", "wildtype", "mutant"],
            color=alt.Color(
                "contact",
                scale=alt.Scale(
                    domain=["contact", "close", "distant"],
                    range=["#1f4e79", "#ff7f0e", "gray"],
                ),
                legend=alt.Legend(title="RBP Distance to Receptor"),
            ),
            opacity=alt.condition(variant_selector,alt.value(1),alt.value(0.2)),
        ).add_params(variant_selector)
    )
    min_ = int(df['effect_E2'].min())
    max_ = int(df['effect_E3'].max())
    
    text = (
        alt.Chart({"values": [{"x": min_, "y": max_, "text": f"r = {r_value:.2f}"}]})
        .mark_text(
            align="left",
            baseline="top",
            dx=-20,  # Adjust this for position
            dy=-20,  # Adjust this for position
        )
        .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
    )
    plot = chart + text

    return plot


tmp_img_corr = correlation_plot(distance_df)
tmp_img_corr.display()
if entry_region_boxplot_plot is not None:
    tmp_img_corr.save(E2_E3_entry_all_muts_plot)

if entry_region_boxplot_plot is not None:
    (E2_E3_plot | tmp_img_corr).save(combined_E2_E3_correlation_plots)
0.82
0.79

Make boxplot showing median entry by RBP region¶

In [13]:
def make_boxplot_entry_region(df):
    barrel_ranges = {
        "Stalk": list(range(70, 147)),
        "Neck": list(range(148, 165)),
        "Linker": list(range(166, 177)),
        "Head": list(range(178, 602)),
    }
    custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact", "Total"]
    empty_charts = []
    for selection in ["CHO-EFNB2", "CHO-EFNB3"]:
        if selection == "CHO-EFNB2":
            effect_name = "EFNB2"
        else:
            effect_name = "EFNB3"

        tmp_df = df[df["cell_type"] == selection]
        agg_means = []
        #tmp_df = tmp_df.groupby('site')['effect'].median().reset_index()
        # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
        for barrel, sites in barrel_ranges.items():
            subset = tmp_df[tmp_df["site"].isin(sites)]
            for _, row in subset.iterrows():
                agg_means.append(
                    {"region": barrel, "effect": row["effect"], "site": row["site"]}
                )
            agg_means_df = pd.DataFrame(agg_means)
        chart = (
            alt.Chart(agg_means_df, title=f"{selection}")
            .mark_boxplot(color='darkgray')
            .encode(
                x=alt.X(
                    "region:O",
                    sort=custom_order,
                    title="RBP Region",
                    axis=alt.Axis(labelAngle=-90),
                ),
                y=alt.Y(
                    "effect",
                    title=f"Cell Entry",
                    axis=alt.Axis(grid=True, tickCount=4),
                ),
                tooltip=["region", "effect", "site"],
            ).properties(width=config['width'],height=config['height'])
        )
        empty_charts.append(chart)
    combined_effect_chart = alt.hconcat(*empty_charts).resolve_scale(
        y="shared", x="shared", color="independent"
    )
    return combined_effect_chart


entry_region_boxplot = make_boxplot_entry_region(concat_df)
entry_region_boxplot.display()
if entry_region_boxplot_plot is not None:
    entry_region_boxplot.save(entry_region_boxplot_plot)

Check for potential neutral/beneficial glycosylation sites¶

In [14]:
def find_potential_glycan_sites(df):
    filtered = df[df["wildtype"].isin(["T", "S"])]
    matching_sites = []
    for index, row in filtered.iterrows():
        # Check for existence of site two numbers prior with 'N' mutant and positive effect
        prior_rows = df[
            (df["site"] == row["site"] - 2) & (df["mutant"] == "N") & (df["effect"] > 0)
        ]
        if not prior_rows.empty:
            matching_sites.append(row["site"])
    unique_list1 = list(set(matching_sites))
    unique_list1 = [x - 2 for x in unique_list1]

    filtered = df[df["wildtype"].isin(["N"])]
    matching_sites = []
    for index, row in filtered.iterrows():
        # Check for existence of site two numbers prior with 'N' mutant and positive effect
        prior_rows = df[
            (df["site"] == row["site"] + 2)
            & (df["mutant"].isin(["T", "S"]))
            & (df["effect"] > 0)
        ]
        if not prior_rows.empty:
            matching_sites.append(row["site"])
    unique_list2 = list(set(matching_sites))
    unique_list = unique_list1 + unique_list2
    unique_list.sort()
    return unique_list
#call function
PNLG = find_potential_glycan_sites(func_scores_E3)
#read in surface exposure data to find potential glycans on surface of protein
surface_df = pd.read_csv(surface)
surface_df.rename(columns={"exposure_A": "Surface Exposure"}, inplace=True)
PNLG_SURFACE = surface_df[surface_df["site"].isin(PNLG)]
PNLG_SURFACE = list(
    PNLG_SURFACE[PNLG_SURFACE["Surface Exposure"] >= 25]["site"].unique()
)

#print(f"\nThe surface exposed PNLG sites are: {PNLG_SURFACE}\n")

glycans = config["glycans"] #these are the glycan sites that are already present in protein

#filter out known glycan sites already present
filtered_PNLG_SURFACE = [num for num in PNLG_SURFACE if num not in glycans]
print(f'These are {len(filtered_PNLG_SURFACE)} potential glycan sites one amino acid away: {filtered_PNLG_SURFACE}')
#print(len(filtered_PNLG_SURFACE))
These are 15 potential glycan sites one amino acid away: [180, 191, 192, 311, 326, 359, 383, 403, 423, 473, 478, 543, 554, 570, 600]

Make bar plot of average entry scores for CHO-bEFNB3¶

In [15]:
def entry_by_site(df):
    tmp_df = df.groupby('site')['effect'].mean().reset_index()
    # define ranges of different RBP regions
    barrel_ranges = {
        "Stalk": list(range(70, 148)),
        "Neck": list(range(148, 166)),
        "Linker": list(range(166, 178)),
        "Head": list(range(178, 602)),
    }
    
    custom_order = ["Stalk", "Neck", "Linker", "Head"] #custom order for color legend
    
    agg_means = [] #store aggregation 
    
    # For each barrel, filter the dataframe to the sites belonging to that barrel and then store the means
    for barrel, sites in barrel_ranges.items():
        subset = tmp_df[tmp_df["site"].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append(
                {"region": barrel, "effect": row["effect"], "site": row["site"]}
            )
        agg_means_df = pd.DataFrame(agg_means).round(3)
    display(agg_means_df)
    agg_means_df['beta_sheet'] = agg_means_df['site'].isin(config['beta_sheet']) #add a column specifying which sites are in beta sheets
    
    ### The main chart plotting
    chart = (
        alt.Chart(agg_means_df)
        .mark_bar(opacity=1)
        .encode(
            x=alt.X("site:N", title='Site',axis=alt.Axis(labelAngle=-90,values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],tickCount=11,grid=True)),
            y=alt.Y("effect", title="Mean Cell Entry"),
            tooltip=["site", "effect","region"],
            color=alt.Color('region',sort=custom_order,title='Region'),
        )
    ).properties(width=1000,height=200)
    
    ### Draw rectanges showing where beta sheets are in protein above chart
    rect = alt.Chart(agg_means_df.query('beta_sheet == True')).mark_rect(color='black').encode(
        alt.X('site:N',axis=None),
        alt.Y('beta_sheet',axis=None)
    ).properties(width=1000,height=10)
    
    combined_chart = alt.vconcat(rect,chart,padding=0).resolve_scale(y='independent',x='shared')
    

    return combined_chart

entry_by_site_plot = entry_by_site(func_scores_E3)
entry_by_site_plot.display()
region effect site
0 Stalk -0.618 71.0
1 Stalk -0.759 72.0
2 Stalk -0.461 73.0
3 Stalk -0.472 74.0
4 Stalk -0.295 75.0
... ... ... ...
526 Head -2.056 597.0
527 Head -0.824 598.0
528 Head 0.118 599.0
529 Head 0.178 600.0
530 Head -0.837 601.0

531 rows × 3 columns

In [16]:
display(concat_df)
site wildtype mutant effect effect_std times_seen n_selections cell_type wildtype_site wt_type mutant_type
0 71 Q C -1.75 0.18 4.62 8 CHO-EFNB2 Q71 hydrophilic special
1 71 Q D -1.16 0.89 4.50 8 CHO-EFNB2 Q71 hydrophilic negative
2 71 Q E -1.25 0.31 5.38 8 CHO-EFNB2 Q71 hydrophilic negative
3 71 Q F -1.06 0.66 4.62 8 CHO-EFNB2 Q71 hydrophilic aromatic
4 71 Q G -1.42 0.59 7.88 8 CHO-EFNB2 Q71 hydrophilic special
... ... ... ... ... ... ... ... ... ... ... ...
19464 602 T R 0.41 0.18 6.29 7 CHO-EFNB3 T602 hydrophilic positive
19465 602 T S 0.38 0.16 3.43 7 CHO-EFNB3 T602 hydrophilic hydrophilic
19466 602 T V 0.41 0.13 5.43 7 CHO-EFNB3 T602 hydrophilic hydrophobic
19467 602 T W 0.52 0.10 6.86 7 CHO-EFNB3 T602 hydrophilic aromatic
19468 602 T Y 0.41 0.24 6.14 7 CHO-EFNB3 T602 hydrophilic aromatic

19469 rows × 11 columns

Make bar charts of mean cell entry by site for different regions, sorted by average and colored by the unmutated amino acid at position¶

In [17]:
display(concat_df.query('site == 72'))
site wildtype mutant effect effect_std times_seen n_selections cell_type wildtype_site wt_type mutant_type
18 72 N A -0.46 0.63 9.12 8 CHO-EFNB2 N72 hydrophilic hydrophobic
19 72 N C -2.84 0.07 8.38 8 CHO-EFNB2 N72 hydrophilic special
20 72 N D -2.78 0.06 12.75 8 CHO-EFNB2 N72 hydrophilic negative
21 72 N E 0.13 0.34 8.88 8 CHO-EFNB2 N72 hydrophilic negative
22 72 N F -2.33 0.82 10.88 8 CHO-EFNB2 N72 hydrophilic aromatic
23 72 N G -0.11 0.17 9.88 8 CHO-EFNB2 N72 hydrophilic special
24 72 N H -0.49 0.22 11.00 8 CHO-EFNB2 N72 hydrophilic positive
25 72 N I -0.99 0.67 11.50 8 CHO-EFNB2 N72 hydrophilic hydrophobic
26 72 N K -0.32 0.19 11.75 8 CHO-EFNB2 N72 hydrophilic positive
27 72 N L -1.26 0.20 7.88 8 CHO-EFNB2 N72 hydrophilic hydrophobic
28 72 N M 0.09 0.30 5.38 8 CHO-EFNB2 N72 hydrophilic hydrophobic
29 72 N P -3.31 0.00 6.75 8 CHO-EFNB2 N72 hydrophilic special
30 72 N Q 0.29 0.39 6.75 8 CHO-EFNB2 N72 hydrophilic hydrophilic
31 72 N R -1.38 0.60 6.38 8 CHO-EFNB2 N72 hydrophilic positive
32 72 N S 0.29 0.20 11.75 8 CHO-EFNB2 N72 hydrophilic hydrophilic
33 72 N T -1.43 0.74 3.62 8 CHO-EFNB2 N72 hydrophilic hydrophilic
34 72 N V -3.16 0.00 5.12 8 CHO-EFNB2 N72 hydrophilic hydrophobic
35 72 N W -2.06 0.30 8.62 8 CHO-EFNB2 N72 hydrophilic aromatic
36 72 N Y -1.27 0.26 18.12 8 CHO-EFNB2 N72 hydrophilic aromatic
9802 72 N A -0.12 0.58 11.43 7 CHO-EFNB3 N72 hydrophilic hydrophobic
9803 72 N C -2.37 0.25 6.14 7 CHO-EFNB3 N72 hydrophilic special
9804 72 N D -1.39 0.61 9.71 7 CHO-EFNB3 N72 hydrophilic negative
9805 72 N E 0.29 0.13 11.57 7 CHO-EFNB3 N72 hydrophilic negative
9806 72 N F -1.12 0.48 8.14 7 CHO-EFNB3 N72 hydrophilic aromatic
9807 72 N G -0.19 0.17 10.00 7 CHO-EFNB3 N72 hydrophilic special
9808 72 N H -0.02 0.38 8.57 7 CHO-EFNB3 N72 hydrophilic positive
9809 72 N I -0.28 0.60 7.57 7 CHO-EFNB3 N72 hydrophilic hydrophobic
9810 72 N K -0.20 0.45 12.43 7 CHO-EFNB3 N72 hydrophilic positive
9811 72 N L -0.72 0.41 7.14 7 CHO-EFNB3 N72 hydrophilic hydrophobic
9812 72 N M -0.15 0.52 5.71 7 CHO-EFNB3 N72 hydrophilic hydrophobic
9813 72 N P -3.50 0.00 7.86 7 CHO-EFNB3 N72 hydrophilic special
9814 72 N Q 0.50 0.12 6.29 7 CHO-EFNB3 N72 hydrophilic hydrophilic
9815 72 N R -0.56 0.75 8.29 7 CHO-EFNB3 N72 hydrophilic positive
9816 72 N S 0.39 0.15 11.71 7 CHO-EFNB3 N72 hydrophilic hydrophilic
9817 72 N T -1.37 0.51 6.57 7 CHO-EFNB3 N72 hydrophilic hydrophilic
9818 72 N V -2.63 0.08 7.57 7 CHO-EFNB3 N72 hydrophilic hydrophobic
9819 72 N W -0.67 0.96 4.29 7 CHO-EFNB3 N72 hydrophilic aromatic
9820 72 N Y -0.32 0.23 12.71 7 CHO-EFNB3 N72 hydrophilic aromatic
In [18]:
def entry_by_site_region(df,site_list,name_of_title,horizontal_width):
    #calculate mean by site
    tmp_df = df.groupby(['site','cell_type'])['effect'].mean().reset_index().round(3) 
    #make a unique values df to merge
    unique_wildtypes_df = df.drop_duplicates(subset=['site','wildtype','wt_type','wildtype_site'])
    #merge mean and unique values
    tmp_df = pd.merge(tmp_df, unique_wildtypes_df[['site', 'wt_type','wildtype','wildtype_site']], on='site', how='left')

    #filter by site
    tmp_df = tmp_df[tmp_df['site'].isin(site_list)]
    #sort sites
    tmp_df = tmp_df.sort_values(by='effect', ascending=False)
    #make chart
    chart = (
        alt.Chart(tmp_df,title=name_of_title)
        .mark_bar(opacity=1,color='gray')
        .encode(
            x=alt.X("site:N", title='Site',axis=alt.Axis(labelAngle=-90)),
            y=alt.Y("effect", title="Mean Cell Entry"),
            tooltip=["site", "effect", 'wildtype', 'wt_type'],
            color=alt.Color('wt_type:N', scale=alt.Scale(scheme='tableau10'), legend=alt.Legend(title="Wildtype Residue Type")),
            #row=alt.Row('cell_type',title=None)

        )
    ).properties(width=alt.Step(15), height=alt.Step(10))
    
    return chart

Ranked average entry in contact sites¶

In [19]:
entry_by_site_receptor_plot = entry_by_site_region(func_scores_E3,config['contact_sites'],'Contact Sites',400)
entry_by_site_receptor_plot.display()

Ranked average entry in dimerization interface¶

In [20]:
entry_by_site_receptor_plot_dimer = entry_by_site_region(func_scores_E3,config['dimerization'],'Dimerization Interface',500)
entry_by_site_receptor_plot_dimer.display()

Now make ranked averages for each region¶

Ranked average entry in RBP stalk¶

In [21]:
entry_by_site_receptor_plot_stalk = entry_by_site_region(func_scores_E3,list(range(70, 148)),'Stalk',950)
entry_by_site_receptor_plot_stalk.display()

Ranked average entry in RBP neck¶

In [22]:
entry_by_site_receptor_plot_neck = entry_by_site_region(func_scores_E3,list(range(147,175)),'Neck',400)
entry_by_site_receptor_plot_neck.display()

Ranked average entry in RBP linker¶

In [23]:
entry_by_site_receptor_plot_linker = entry_by_site_region(func_scores_E3,list(range(166, 178)),'Linker',300)
entry_by_site_receptor_plot_linker.display()
In [24]:
combined_region_barplot = alt.vconcat(entry_by_site_receptor_plot_stalk,entry_by_site_receptor_plot_neck,entry_by_site_receptor_plot_linker,entry_by_site_receptor_plot).resolve_scale(y="shared", x="independent", color="shared")
combined_region_barplot.display()
In [ ]:
 
In [ ]: